Added hook to tokenizer and to parser for language-specific token processing

author Jens Frank <jeluf@users.mediawiki.org>

Tue, 2 Mar 2004 20:23:56 +0000 (20:23 +0000)

committer Jens Frank <jeluf@users.mediawiki.org>

Tue, 2 Mar 2004 20:23:56 +0000 (20:23 +0000)
author Jens Frank <jeluf@users.mediawiki.org>
Tue, 2 Mar 2004 20:23:56 +0000 (20:23 +0000)
committer Jens Frank <jeluf@users.mediawiki.org>
Tue, 2 Mar 2004 20:23:56 +0000 (20:23 +0000)
diff --git a/includes/Parser.php b/includes/Parser.php

index 2e4e802..e92f6c3 100644 (file)
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -360,7 +360,7 @@ class Parser
                 $text = $this->removeHTMLtags( $text );
                 $text = $this->replaceVariables( $text );
  
-               $text = preg_replace( "/(^|\n)-----*/", "\\1<hr>", $text );
+               # $text = preg_replace( "/(^|\n)-----*/", "\\1<hr>", $text );
                 $text = str_replace ( "<HR>", "<hr>", $text );
  
                 $text = $this->doHeadings( $text );
@@ -542,6 +542,8 @@ class Parser
  
         /* private */ function replaceInternalLinks( $str )
         {
+               global $wgLang; # for language specific parser hook
+
                 $tokenizer=Tokenizer::newFromString( $str );
                 $tokenStack = array();
                 
@@ -596,6 +598,9 @@ class Parser
                                         }
                                         $tagIsOpen = (count( $tokenStack ) != 0);
                                         break;
+                               case "----":
+                                       $txt = "\n<hr>\n";
+                                       break;
                                 case "'''":
                                         # This and the three next ones handle quotes
                                         $txt = $this->handle3Quotes( $state, $token );
@@ -611,9 +616,13 @@ class Parser
                                         $txt="";
                                         break;
                                 default:
-                                       # An unkown token. Highlight.
-                                       $txt = "<font color=\"#FF0000\"><b>".$token["type"]."</b></font>";
-                                       $txt .= "<font color=\"#FFFF00\"><b>".$token["text"]."</b></font>";
+                                       # Call language specific Hook.
+                                       $txt = $wgLang->processToken( $token, $tokenStack );
+                                       if ( NULL == $txt ) {
+                                               # An unknown token. Highlight.
+                                               $txt = "<font color=\"#FF0000\"><b>".$token["type"]."</b></font>";
+                                               $txt .= "<font color=\"#FFFF00\"><b>".$token["text"]."</b></font>";
+                                       }
                                         break;
                         }
                         # If we're parsing the interior of a link, don't append the interior to $s,
diff --git a/includes/Tokenizer.php b/includes/Tokenizer.php

index d7eb080..beeda47 100644 (file)
--- a/includes/Tokenizer.php
+++ b/includes/Tokenizer.php
@@ -26,22 +26,27 @@ class Tokenizer {
         function preParse()
         {
                 global $wgLang;
+
+               # build up the regex, step by step.
+               # Basic features: Quotes for <em>/<strong> and hyphens for <hr>
+               $regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*";
+               # Append regex for linkPrefixExtension 
                 if (  $wgLang->linkPrefixExtension() ) {
-                       $regex = "/(([a-zA-Z\x80-\xff]+)\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/";
-                       #          000000000000000000000000000000000000000000000000000000
-                       #           1111111111111111111111111111111111111111111111111111
-                       #            222222222222222222
-                       # which $this->mMatch[...] will contain the match.
+                       $regex .= "|([a-zA-Z\x80-\xff]+)\[\[";
                 } else {
-                       $regex = "/(\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/";
+                       $regex .= "|\[\[";
                 }
+               # Closing link
+               $regex .= "|\]\]";
+               # Language-specific additions
+               $regex .= $wgLang->tokenizerRegex();
+               # Finalize regex
+               $regex = "/(" . $regex . ")/";
  
+               # Apply the regex to the text
                 $this->mCount = preg_match_all( $regex, $this->mText, $this->mMatch,
                                                 PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE);
                 $this->mMatchPos=0;
-               # print( "<pre>" );
-               # print_r( $this->mMatch );
-               # print( "</pre>" );
         }
  
         function nextToken()
@@ -76,6 +81,12 @@ class Tokenizer {
                                         $token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix
                                 } else {
                                         $token["type"] = $this->mMatch[0][$this->mMatchPos][0];
+                                       if ( substr($token["type"],1,4) == "----" )
+                                       {
+                                               # four or more hyphens (after the leading newline) are an <hr>.
+                                               # normalise the token type to exactly four.
+                                               $token["type"]="----";
+                                       }
                                 }
                                 # What the pointers would change to if this would not just be a preview
                                 $token["mPos"] = $this->mPos + strlen( $this->mMatch[0][$this->mMatchPos][0] );
diff --git a/languages/Language.php b/languages/Language.php

index 100e4cf..df7e396 100644 (file)
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1732,6 +1732,20 @@ class Language {
         {
                 return "<em>$text</em>";
         }
+
+       # Returns extra regex alternatives for the tokenizer, each prefixed with "|" ("" = none). See LanguageFr.php for an example
+       function tokenizerRegex()
+       {
+               return "";
+       }
+
+       # Process a token matched by the tokenizerRegex() additions. Return the text to add to the output,
+       # or NULL if the token is unknown (the parser tests NULL == $txt, so "" is treated as unknown too)
+       function processToken( &$token , &$tokenStack)
+       {
+               return NULL;
+       }
+
  }
  
  @include_once( "Language" . ucfirst( $wgLanguageCode ) . ".php" );
diff --git a/languages/LanguageFr.php b/languages/LanguageFr.php

index 2e1a858..86ee9a6 100644 (file)
--- a/languages/LanguageFr.php
+++ b/languages/LanguageFr.php
@@ -1066,6 +1066,32 @@ class LanguageFr extends Language
                 else return $m;
  
         }
+
+       # Returns extra tokenizer alternatives: a space before : » ! ?, « followed by a space, a space between two digits.
+       function tokenizerRegex()
+       {
+               return "| [:»!?]|« |[0-9] [0-9]";
+       }
+
+       # Process a token matched by the tokenizerRegex() alternatives: the space becomes &nbsp;.
+       # Returns the replacement text, or NULL if the token type matches none of the patterns
+       function processToken( &$token , &$tokenStack)
+       {
+               if ( preg_match( "/ ([:»!?])/", $token["type"], $m ) )
+               {
+                       $txt = "&nbsp;" . $m[1]; # keep the punctuation mark, swap the space before it
+               } elseif ( "« " == $token["type"] )
+               {
+                       $txt = "«&nbsp;"; # swap the space after the opening guillemet
+               } elseif ( preg_match( "/([0-9]) ([0-9])/", $token["type"], $m ) )
+               {
+                       $txt = $m[1] . "&nbsp;" . $m[2]; # keep both digits, swap the space between them
+               } else
+               {
+                       $txt = NULL; # not one of our tokens; the caller handles it as unknown
+               }
+               return $txt;
+       }
  }
  
  ?>
author	Jens Frank <jeluf@users.mediawiki.org>
	Tue, 2 Mar 2004 20:23:56 +0000 (20:23 +0000)
committer	Jens Frank <jeluf@users.mediawiki.org>
	Tue, 2 Mar 2004 20:23:56 +0000 (20:23 +0000)
includes/Parser.php		patch \| blob \| history
includes/Tokenizer.php		patch \| blob \| history
languages/Language.php		patch \| blob \| history
languages/LanguageFr.php		patch \| blob \| history